R setup

# Load libraries
library(plyr)
library(tidyverse)
library(edgeR)
library(AnnotationHub)
library(magrittr)
library(scales)
library(pander)
library(ggrepel)
library(fgsea)
library(pheatmap)
library(igraph)
library(tidygraph)
library(ggraph)
library(grid)
library(RUVSeq)

2019 data analysis

Differential Expression Analysis

Load data

# Load counts analysed by feature counts
counts <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  dplyr::select(Geneid, starts_with("W"), starts_with("Q"))

Create DGEList

# Create DGEList and calculate normalisaton factors
dgeList <- counts %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Set group variable
dgeList$samples$group <- colnames(dgeList) %>%
  str_extract("(W|Q)") %>%
  factor(levels = c("W", "Q"))

Add gene information

# Add AnnotationHub and subset to search for zebrafish
ah <- AnnotationHub()
ah %>%
  subset(species == "Danio rerio") %>%
  subset(dataprovider == "Ensembl") %>%
  subset(rdataclass == "EnsDb")
## AnnotationHub with 11 records
## # snapshotDate(): 2019-05-02 
## # $dataprovider: Ensembl
## # $species: Danio rerio
## # $rdataclass: EnsDb
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## #   tags, rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH53201"]]' 
## 
##             title                           
##   AH53201 | Ensembl 87 EnsDb for Danio Rerio
##   AH53705 | Ensembl 88 EnsDb for Danio Rerio
##   AH56671 | Ensembl 89 EnsDb for Danio Rerio
##   AH57746 | Ensembl 90 EnsDb for Danio Rerio
##   AH60762 | Ensembl 91 EnsDb for Danio Rerio
##   ...       ...                             
##   AH64434 | Ensembl 93 EnsDb for Danio Rerio
##   AH64906 | Ensembl 94 EnsDb for Danio rerio
##   AH67932 | Ensembl 95 EnsDb for Danio rerio
##   AH69169 | Ensembl 96 EnsDb for Danio rerio
##   AH73861 | Ensembl 97 EnsDb for Danio rerio
# Select correct Ensembl release
ensDb <- ah[["AH64906"]]
# Extract GenomicRanges object from ensDb
genesGR <- genes(ensDb)
# Remove redundant columns from mcols
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name", 
                                   "gene_biotype", "entrezid")]
# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeList$genes <- genesGR[rownames(dgeList),]

Data QC

# Perform logical test to see how many genes were not detected in dataset
dgeList$counts %>%
  rowSums() %>%
  is_greater_than(0) %>%
  table()
## .
## FALSE  TRUE 
##  3927 28130
# Check for genes having > 4 samples with cpm > 1
dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4) %>%
  table()
## .
## FALSE  TRUE 
## 13704 18353
# Create logical vector of genes to keep that fit criteria
genes2keep <- dgeList %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeList %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFilt %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

Library sizes

# Check library sizes with box plot
dgeFilt$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library Size") +
  scale_fill_discrete(
    name ="Genotype", 
    labels = c("Wildtype", "Mutant")
  ) +
  scale_x_discrete(labels=c("W" = "Wildtype", "Q" = "Mutant")) +
  theme_bw()

PCA

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pca <- dgeFilt %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 22.27 18.07 16.75 14.73 14.45 13.34 11.87 11.2 5.671e-14
Proportion of Variance 0.2513 0.1655 0.1421 0.1099 0.1058 0.09023 0.07145 0.06362 0
Cumulative Proportion 0.2513 0.4168 0.559 0.6689 0.7747 0.8649 0.9364 1 1
# Plot PCA
pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFilt$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

Differential expression

# Create model matrix
design <- model.matrix(~group, data = dgeFilt$samples)
# Perform exact test on DGEList
topTable <- dgeFilt %>%
  estimateDisp(design = design) %>%
  exactTest() %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  dplyr::select(
    Geneid = ID.gene_id, 
    Symbol = ID.gene_name,
    AveExpr = logCPM, logFC, 
    P.Value = PValue, 
    FDR, Location, 
    Entrez = ID.entrezid
  ) %>%
  mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTable %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(data = . %>% 
                    dplyr::filter(DE) %>%
                    dplyr::filter(-log10(P.Value) > 4 | abs(logFC) >                        2.5), aes(label = Symbol)) + 
  scale_colour_manual(values = c("grey", "red")) +
  theme_bw() +
  theme(legend.position = "none")

# MD Plot showing DE genes
topTable %>%
  dplyr::arrange(desc(P.Value)) %>%
  ggplot(aes(AveExpr, logFC, colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(
    data = . %>% 
      dplyr::filter(DE) %>%
      dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
    aes(label = Symbol)
  ) + 
  scale_colour_manual(values = c("grey", "red")) +
  labs(x = "Average Expression (log2 CPM)",
       y = "log Fold-Change") +
  theme_bw() +
  theme(legend.position = "none")

# Summary of DE genes
topTableDE <- topTable %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) 
topTableDE %>% pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR
ENSDARG00000091368 AL954327.1 2.656 -5.923 1.214e-08 0.0002228
ENSDARG00000093214 si:ch211-284e13.9 0.8189 1.542 6.086e-07 0.005585
ENSDARG00000037421 egr1 8.527 -0.712 9.887e-07 0.006049
ENSDARG00000017246 prx 2.326 -2.614 1.823e-06 0.008364
ENSDARG00000089477 si:ch211-132g1.3 5.855 0.6079 3.135e-06 0.01089
ENSDARG00000089382 zgc:158463 5.631 0.6536 3.561e-06 0.01089
ENSDARG00000080337 NC_002333.4 11.17 0.4449 5.344e-06 0.01401
ENSDARG00000096829 blvrb 3.025 -1.521 1.643e-05 0.03386
ENSDARG00000093438 CU467110.1 4.596 0.5459 1.752e-05 0.03386
ENSDARG00000091916 ugt5b4 -0.0384 -1.445 1.994e-05 0.03386
ENSDARG00000036304 dnaaf3l 1.358 -1.44 2.029e-05 0.03386

GO Enrichment

ens2Entrez <- file.path("https://uofabioinformaticshub.github.io/Intro-NGS-fib",
                        "data", "ens2Entrez.tsv") %>% 
  url() %>%
  read_tsv()
de <- topTable %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid) %>%
  left_join(ens2Entrez) %>%
  dplyr::filter(!is.na(Entrez)) %>%
  .[["Entrez"]] %>%
  unique()
uv <- topTable %>%
  dplyr::select(Geneid) %>%
  left_join(ens2Entrez) %>%
  dplyr::filter(!is.na(Entrez)) %>%
  .[["Entrez"]] %>%
  unique()
goResults <- goana(de = de, universe = uv, species = "Hs")
goResults %>% 
  rownames_to_column("GO ID") %>%
  as_tibble() %>%
  dplyr::filter(DE > 1) %>%
  dplyr::arrange(P.DE) %>%
  mutate(FDR = p.adjust(P.DE, "fdr")) %>%
  dplyr::filter(FDR < 0.05) %>%
  mutate(`GO ID` = str_replace(`GO ID`, ":", "\\\\:")) %>%
  pander(caption = "GO Terms potentially enriched in the set of differentially expressed genes")
GO Terms potentially enriched in the set of differentially expressed genes
GO ID Term Ont N DE P.DE FDR

Gene Set Enrichment Analysis (GSEA)

Setting up ID conversion, ranks and pathways

# Load id conversion file
idConvert <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(Geneid = zfID, EntrezID = Entrez) %>%
  mutate(EntrezID = as.character(EntrezID))
# Create function to convert ids (Not sure how this works, Steve wrote it)
convertHsEG2Dr <- function(ids, df = idConvert){
  dplyr::filter(df, EntrezID %in% ids)$Geneid
}
# Conversion of zebrafish ensembl ID to zebrafish symbol, for plotting on network analyses
idConvertSymbol <- read_csv2("../files/zf2human_withEntrezIDs.csv") %>%
  dplyr::select(label = zfID, symbol = zfName) %>%
  na.omit() %>%
  unique()
# Create named vector of gene level statistics 
ranks <- topTable %>%
  mutate(stat = -sign(logFC) * log10(P.Value)) %>%
  dplyr::arrange(desc(stat)) %>%
  with(structure(stat, names = Geneid))
# Import hallmark human gene genesets and tidy gene set names
# .gmt files downloaded from:
# http://software.broadinstitute.org/gsea/downloads.jsp 
# http://data.wikipathways.org/20190610/ 
hallmark <- gmtPathways("../files/h.all.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "HALLMARK_"))
kegg <- gmtPathways("../files/c2.cp.kegg.v6.2.entrez.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "KEGG_"))
wiki <- gmtPathways("../files/wikipathways-20190610-gmt-Homo_sapiens.gmt") %>%
  mclapply(convertHsEG2Dr, mc.cores = 4) %>%
  set_names(str_remove_all(names(.), "%.+"))

Hallmark

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmark <- fgsea(hallmark, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaHallmarkTop <- fgseaHallmark %>%
  dplyr::filter(padj < 0.05)
fgseaHallmarkTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 13 most significantly enriched Hallmark pathways. This corresponds to an FDR of 0.284%
pathway pval FDR ES NES size padj
MYC_TARGETS_V1 1.236e-05 0.0001011 0.6076 2.418 220 0.0006178
OXIDATIVE_PHOSPHORYLATION 1.237e-05 0.0001011 0.6488 2.58 219 0.0006184
INTERFERON_GAMMA_RESPONSE 1.251e-05 0.0001011 0.5684 2.242 200 0.0006256
E2F_TARGETS 1.254e-05 0.0001011 0.464 1.827 197 0.0006268
ALLOGRAFT_REJECTION 1.256e-05 0.0001011 0.5472 2.153 195 0.0006278
DNA_REPAIR 1.304e-05 0.0001011 0.5417 2.069 150 0.0006519
INTERFERON_ALPHA_RESPONSE 1.415e-05 0.0001011 0.5773 2.034 83 0.0007075
MYOGENESIS 5.222e-05 0.0003264 -0.4095 -1.859 216 0.002611
WNT_BETA_CATENIN_SIGNALING 6.038e-05 0.0003354 -0.5614 -2.034 52 0.003019
ADIPOGENESIS 0.0001853 0.0008935 0.4063 1.617 220 0.009267
MTORC1_SIGNALING 0.0001966 0.0008935 0.4066 1.624 228 0.009829
G2M_CHECKPOINT 0.0005689 0.002371 0.3955 1.572 216 0.02845
KRAS_SIGNALING_DN 0.0007387 0.002841 -0.3669 -1.599 155 0.03694
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  hallmark[fgseaHallmark$pathway], ranks, fgseaHallmark, gseaParam = 0.5
)

KEGG

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for KEGG
fgseaKEGG <- fgsea(kegg, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGTop <- fgseaKEGG %>%
  dplyr::filter(padj < 0.05)
fgseaKEGGTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 29 most significantly enriched KEGG pathways. This corresponds to an FDR of 0.171%
pathway pval FDR ES NES size padj
CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 1.207e-05 0.0002836 0.4479 1.812 260 0.002246
CHEMOKINE_SIGNALING_PATHWAY 1.268e-05 0.0002836 0.6442 2.524 187 0.002358
HUNTINGTONS_DISEASE 1.276e-05 0.0002836 0.5849 2.281 180 0.002373
ALZHEIMERS_DISEASE 1.293e-05 0.0002836 0.555 2.141 164 0.002406
SPLICEOSOME 1.333e-05 0.0002836 0.6179 2.32 131 0.002479
OXIDATIVE_PHOSPHORYLATION 1.337e-05 0.0002836 0.7172 2.685 128 0.002486
PARKINSONS_DISEASE 1.344e-05 0.0002836 0.7061 2.63 123 0.002501
RIBOSOME 1.426e-05 0.0002836 0.8662 3.032 79 0.002653
NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY 1.468e-05 0.0002836 0.6316 2.126 62 0.00273
PROTEASOME 1.525e-05 0.0002836 0.7112 2.259 45 0.002836
LINOLEIC_ACID_METABOLISM 2.872e-05 0.0004856 -0.6266 -2.178 43 0.005342
ECM_RECEPTOR_INTERACTION 3.345e-05 0.0005089 -0.5644 -2.212 79 0.006222
ASCORBATE_AND_ALDARATE_METABOLISM 4.629e-05 0.0005089 -0.7435 -3.304 181 0.00861
PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS 4.688e-05 0.0005089 -0.7556 -3.364 184 0.008719
STARCH_AND_SUCROSE_METABOLISM 4.85e-05 0.0005089 -0.7369 -3.307 196 0.00902
PORPHYRIN_AND_CHLOROPHYLL_METABOLISM 4.877e-05 0.0005089 -0.7057 -3.175 199 0.009071
STEROID_HORMONE_BIOSYNTHESIS 5.1e-05 0.0005089 -0.695 -3.152 213 0.009486
FOCAL_ADHESION 5.198e-05 0.0005089 -0.4533 -2.062 219 0.009669
DRUG_METABOLISM_OTHER_ENZYMES 5.222e-05 0.0005089 -0.6915 -3.146 220 0.009714
DRUG_METABOLISM_CYTOCHROME_P450 5.856e-05 0.0005089 -0.6977 -3.24 262 0.01089
RETINOL_METABOLISM 5.896e-05 0.0005089 -0.6954 -3.231 264 0.01097
METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450 6.019e-05 0.0005089 -0.6907 -3.216 270 0.01119
PATHWAYS_IN_CANCER 7.027e-05 0.0005683 -0.3296 -1.568 334 0.01307
SYSTEMIC_LUPUS_ERYTHEMATOSUS 7.645e-05 0.0005925 0.61 1.93 44 0.01422
AXON_GUIDANCE 0.0001317 0.0009798 -0.3794 -1.663 162 0.0245
JAK_STAT_SIGNALING_PATHWAY 0.0001705 0.00122 -0.3899 -1.693 151 0.03172
ARRHYTHMOGENIC_RIGHT_VENTRICULAR_CARDIOMYOPATHY_ARVC 0.0002086 0.001437 -0.4541 -1.817 89 0.03881
PATHOGENIC_ESCHERICHIA_COLI_INFECTION 0.000225 0.001495 0.5804 1.894 52 0.04185
NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 0.0002659 0.001706 -0.3377 -1.544 228 0.04946
# Make a table plot of significant KEGG pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  kegg[fgseaKEGG$pathway], ranks, fgseaKEGG, gseaParam = 0.5
)

WikiPathways

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for WikiPathways
fgseaWiki <- fgsea(wiki, ranks, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiTop <- fgseaWiki %>%
  dplyr::filter(padj < 0.05)
fgseaWikiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Wiki pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 24 most significantly enriched Wiki pathways. This corresponds to an FDR of 0.181%
pathway pval FDR ES NES size padj
Chemokine signaling pathway 1.284e-05 0.0009715 0.5945 2.303 170 0.006691
Nonalcoholic fatty liver disease 1.299e-05 0.0009715 0.5834 2.239 156 0.006768
mRNA Processing 1.328e-05 0.0009715 0.5371 2.022 133 0.006919
Electron Transport Chain (OXPHOS system in mitochondria) 1.382e-05 0.0009715 0.7417 2.677 97 0.007198
Cytoplasmic Ribosomal Proteins 1.416e-05 0.0009715 0.8704 3.052 80 0.007377
Parkin-Ubiquitin Proteasomal System pathway 1.436e-05 0.0009715 0.6106 2.101 71 0.007483
Oxidative phosphorylation 1.466e-05 0.0009715 0.7527 2.517 60 0.007635
Mitochondrial complex I assembly model OXPHOS system 1.492e-05 0.0009715 0.7502 2.444 52 0.007772
Nicotine Metabolism 2.591e-05 0.001041 -0.768 -2.255 21 0.0135
Striated Muscle Contraction Pathway 2.788e-05 0.001041 -0.6834 -2.256 34 0.01452
Irinotecan Pathway 2.848e-05 0.001041 -0.7029 -2.396 39 0.01484
Proteasome Degradation 2.926e-05 0.001041 0.5898 1.979 61 0.01525
Estrogen metabolism 3.135e-05 0.001041 -0.645 -2.398 59 0.01633
Pregnane X Receptor pathway 3.404e-05 0.001041 -0.6214 -2.444 80 0.01773
Constitutive Androstane Receptor Pathway 3.417e-05 0.001041 -0.6225 -2.452 81 0.0178
Tamoxifen metabolism 3.485e-05 0.001041 -0.6519 -2.594 86 0.01816
Aryl Hydrocarbon Receptor Pathway 3.544e-05 0.001041 -0.6158 -2.472 91 0.01846
Codeine and Morphine Metabolism 3.598e-05 0.001041 -0.7161 -2.895 95 0.01875
TGF-beta Signaling Pathway 4.232e-05 0.00116 -0.4216 -1.822 148 0.02205
Glucuronidation 4.759e-05 0.001205 -0.7632 -3.401 187 0.02479
Focal Adhesion 5.09e-05 0.001205 -0.4598 -2.085 213 0.02652
NRF2 pathway 5.09e-05 0.001205 -0.4504 -2.043 213 0.02652
Metapathway biotransformation Phase I and II 7.519e-05 0.001703 -0.5239 -2.51 360 0.03918
Nuclear Receptors Meta-Pathway 8.361e-05 0.001815 -0.3388 -1.64 404 0.04356
# Make a table plot of significant WikiPathways pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(
  wiki[fgseaWiki$pathway], ranks, fgseaWiki, gseaParam = 0.5
)

Network Analysis

Hallmark

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmark <-
  fgseaHallmarkTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysHallmark <- names(sigHallmark) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesHallmark <- unique(unlist(sigHallmark)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesHallmark <- full_join(pathwaysHallmark, genesHallmark, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesHallmark <- ldply(sigHallmark, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesHallmark, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesHallmark, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyHallmark <- 
  tbl_graph(nodes = nodesHallmark, edges = edgesHallmark, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaHallmarkTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDE$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTable$Geneid ~ 
        as.integer(row_number(label %in% topTable$Geneid)), 
      id <= nrow(fgseaHallmarkTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaHallmarkTop) ~ rainbow(nrow(fgseaHallmarkTop))[id],
      label %in% topTableDE$Geneid ~ "black"),
    hjust = case_when(
      DE == "ugt5b4" ~ as.integer(0)),
    vjust = case_when(DE == "ugt5b4" ~ as.integer(5))
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaHallmarkTop) ~ rainbow(nrow(fgseaHallmarkTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmark, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaHallmarkTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour),
    shape = 21,
    stroke = 0.5, 
    
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways),
    repel = TRUE,
    size = 3, 
    alpha = 0.7,
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE, hjust = hjust, vjust = vjust), 
    repel = TRUE,
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

KEGG

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGG <- 
  fgseaKEGGTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>%
  lapply(unlist)
# Create a node list
pathwaysKEGG <- names(sigKEGG) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesKEGG <- unique(unlist(sigKEGG)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesKEGG <- full_join(pathwaysKEGG, genesKEGG, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesKEGG <- ldply(sigKEGG, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesKEGG, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesKEGG, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyKEGG <- 
  tbl_graph(nodes = nodesKEGG, edges = edgesKEGG, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaKEGGTop) ~ label
    ),
    DE = case_when(label %in% topTableDE$Geneid ~ symbol),
    size = case_when(
      label %in% topTable$Geneid ~ row_number(label %in% topTable$Geneid), 
      id <= nrow(fgseaKEGGTop) ~ 4000L
    ) %>% as.integer(),
    colour = case_when(
      id <= nrow(fgseaKEGGTop) ~ rainbow(nrow(fgseaKEGGTop))[id],
      label %in% topTableDE$Geneid ~ "black"
    ),
    hjust = case_when(
      DE == "ugt5b4" ~ -1L,
      DE == "blvrb" ~ 7L
    ),
    vjust = case_when(
      DE == "ugt5b4" ~ 7L,
      DE == "blvrb" ~ 0L
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaKEGGTop) ~ rainbow(nrow(fgseaKEGGTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(26)
# Plot graph
ggraph(tidyKEGG, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaKEGGTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways),
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE, hjust = hjust, vjust = vjust), 
    repel = TRUE,
    size = 3,
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

WikiPathways

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWiki <- 
  fgseaWikiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysWiki <- names(sigWiki) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesWiki <- unique(unlist(sigWiki)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesWiki <- full_join(pathwaysWiki, genesWiki, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesWiki <- ldply(sigWiki, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesWiki, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesWiki, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyWiki <- 
  tbl_graph(nodes = nodesWiki, edges = edgesWiki, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaWikiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDE$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTable$Geneid ~ 
        as.integer(row_number(label %in% topTable$Geneid)), 
      id <= nrow(fgseaWikiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaWikiTop) ~ rainbow(nrow(fgseaWikiTop))[id],
      label %in% topTableDE$Geneid ~ "black"
    ),
    hjust = case_when(
      DE == "ugt5b4" ~ as.integer(1),
      DE == "blvrb" ~ as.integer(2)
    ),
    vjust = case_when(
      DE == "ugt5b4" ~ as.integer(7),
      DE == "blvrb" ~ as.integer(7)
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaWikiTop) ~ rainbow(nrow(fgseaWikiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(27)
# Plot graph
ggraph(tidyWiki, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaWikiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE, hjust = hjust, vjust = vjust), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

RUVSeq

k = 1

# Extract subset of low expressed genes from DE analysis to act as negative controls for RUVg procedure
negControl <- topTable %>% 
  dplyr::arrange(desc(P.Value)) %>%
  .[1:10000,] %>%
  .$Geneid
# Run RUVSeq
RUVk1 <- RUVg(dgeFilt$counts, negControl, 1)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeRUVk1 <- dgeFilt
# Replace with results
dgeRUVk1$counts <- RUVk1$normalizedCounts
# Run PCA function
pcaRUVk1 <- dgeRUVk1 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVk1)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 20.11 18.01 16.48 15.75 14.34 13.36 12.55 11.28 7.413e-14
Proportion of Variance 0.2109 0.1691 0.1417 0.1294 0.1073 0.09307 0.0822 0.06639 0
Cumulative Proportion 0.2109 0.38 0.5217 0.6511 0.7583 0.8514 0.9336 1 1
# Plot PCA
pcak1 <- pcaRUVk1$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeRUVk1$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

k = 2

# Run RUVSeq
RUVk2 <- RUVg(dgeFilt$counts, negControl, 2)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeRUVk2 <- dgeFilt
# Replace with results
dgeRUVk2$counts <- RUVk2$normalizedCounts
# Run PCA function
pcaRUVk2 <- dgeRUVk2 %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVk2)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 18.38 17.65 16.58 14.48 13.37 12.59 11.27 0.4422 4.728e-14
Proportion of Variance 0.2113 0.1949 0.1721 0.1311 0.1119 0.09915 0.07951 0.00012 0
Cumulative Proportion 0.2113 0.4062 0.5782 0.7093 0.8212 0.9204 0.9999 1 1
# Plot PCA
pcak2 <- pcaRUVk2$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeRUVk2$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

Results

# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.5, width = 0.5, height = 1)
vp2 <- viewport(x = 0.75, y = 0.5, width = 0.5, height = 1)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(pcak1, vp = vp1)
print(pcak2, vp = vp2)

2017 data analysis

Differential Expression Analysis

Load data

# Load Nhi's DGEList
dgeListNhi <- read_rds("nhiData/dge_g.rds") %>%
  .[,rownames(subset(.$samples, Age == 6 & Hypoxia == 0))]
# Convert counts to integers. Nhi used kallisto which is why they are not integers. (This is needed in hindsight because RUVSeq did not accept decimal values)
dgeListNhi$counts <- dgeListNhi$counts %>%
  round()
# Set group variable
dgeListNhi$samples$group <- colnames(dgeListNhi) %>%
  str_extract("(w|q)") %>%
  factor(levels = c("w", "q"))

Add gene information

# Add genesGR to DGEList using rownames of DGEList to reorder the genesGR
dgeListNhi$genes <- genesGR[rownames(dgeListNhi),]

Data QC

# Perform logical test to see how many genes were not detected in dataset
dgeListNhi$counts %>%
  rowSums() %>%
  is_greater_than(0) %>%
  table()
## .
## FALSE  TRUE 
##  1364 24217
# Check for genes having >= 4 samples with cpm > 1
dgeListNhi %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4) %>%
  table()
## .
## FALSE  TRUE 
##  7082 18499
# Create logical vector of genes to keep that fit criteria
genes2keepNhi <- dgeListNhi %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(4)
# Create new DGEList of genes fitting criteria
dgeFiltNhi <- dgeListNhi[genes2keepNhi,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Remove unneeded columns from samples element
dgeFiltNhi$samples <- dgeFiltNhi$samples %>%
  dplyr::select(group, lib.size, norm.factors)
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListNhi %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltNhi %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

Library sizes

# Check library sizes with box plot
dgeFiltNhi$samples %>%
  ggplot(aes(group, lib.size, fill = group)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  labs(x = "Genotype", y = "Library Size") +
  scale_fill_discrete(
    name ="Genotype", 
    labels = c("Wildtype","Mutant")
  ) +
  scale_x_discrete(labels=c("w" = "Wildtype", "q" = "Mutant")) +
  theme_bw()

PCA

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaNhi <- dgeFiltNhi %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaNhi)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 21.47 17.03 15.79 14.59 13.5 12.95 5.234e-14 3.079e-14
Proportion of Variance 0.2948 0.1856 0.1595 0.1362 0.1165 0.1073 0 0
Cumulative Proportion 0.2948 0.4804 0.6399 0.7761 0.8927 1 1 1
# Plot PCA
pcaNhi$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFiltNhi$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

Differential expression

# Create model matrix
design <- model.matrix(~group, data = dgeFiltNhi$samples)
# Perform exact test on DGEList
topTableNhi <- dgeFiltNhi %>%
  estimateDisp(design = design) %>%
  exactTest() %>%
  topTags(n = Inf) %>%
  .$table %>%
  as_tibble() %>%
  unite("Range", ID.start, ID.end, sep = "-") %>%
  unite("Location", ID.seqnames, Range, ID.strand, sep = ":") %>%
  dplyr::select(Geneid = ID.gene_id, 
                Symbol = ID.gene_name,
                AveExpr = logCPM, logFC, 
                P.Value = PValue, 
                FDR, Location, 
                Entrez = ID.entrezid) %>%
  mutate(DE = FDR < 0.05)
# Volcano plot showing DE genes
topTableNhi %>%
  ggplot(aes(logFC, -log10(P.Value), colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(data = . %>% 
                    dplyr::filter(DE) %>%
                    dplyr::filter(-log10(P.Value) > 4 | abs(logFC) >                        2.5), aes(label = Symbol)) + 
  scale_colour_manual(values = c("grey", "red")) +
  theme_bw() +
  theme(legend.position = "none")

# MD Plot showing DE genes
topTableNhi %>%
  dplyr::arrange(desc(P.Value)) %>%
  ggplot(aes(AveExpr, logFC, colour = DE)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(data = . %>% 
                    dplyr::filter(DE) %>%
                    dplyr::filter(abs(logFC) > 2 | AveExpr > 14),
                  aes(label = Symbol)) + 
  scale_colour_manual(values = c("grey", "red")) +
  labs(x = "Average Expression (log2 CPM)",
       y = "log Fold-Change") +
  theme_bw() +
  theme(legend.position = "none")

# Summary of DE genes
topTableDENhi <- topTableNhi %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR) 
topTableDENhi %>% pander(style = "rmarkdown", split.tables = Inf)
Geneid Symbol AveExpr logFC P.Value FDR

Gene Set Enrichment Analysis (GSEA)

# Create named vector of gene level statistics 
ranksNhi <- topTableNhi %>%
  mutate(stat = -sign(logFC) * log10(P.Value)) %>%
  dplyr::arrange(desc(stat)) %>%
  with(structure(stat, names = Geneid))

Hallmark

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaHallmarkNhi <- fgsea(hallmark, ranksNhi, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaHallmarkNhiTop <- fgseaHallmarkNhi %>%
  dplyr::filter(padj < 0.05)
fgseaHallmarkNhiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 10 most significantly enriched Hallmark pathways. This corresponds to an FDR of 0.468%
pathway pval FDR ES NES size padj
ALLOGRAFT_REJECTION 1.921e-05 0.0002092 0.4589 1.88 196 0.0009606
INTERFERON_GAMMA_RESPONSE 1.922e-05 0.0002092 0.482 1.975 197 0.0009611
MTORC1_SIGNALING 2.091e-05 0.0002092 -0.4538 -1.906 228 0.001046
MYC_TARGETS_V1 2.091e-05 0.0002092 -0.4101 -1.717 221 0.001046
OXIDATIVE_PHOSPHORYLATION 2.092e-05 0.0002092 -0.6225 -2.607 222 0.001046
FATTY_ACID_METABOLISM 8.32e-05 0.0006933 -0.393 -1.618 190 0.00416
CHOLESTEROL_HOMEOSTASIS 0.0002258 0.001613 -0.4947 -1.801 81 0.01129
INTERFERON_ALPHA_RESPONSE 0.0003125 0.001743 0.4818 1.766 89 0.01563
ADIPOGENESIS 0.0003137 0.001743 -0.3733 -1.563 221 0.01569
HEME_METABOLISM 0.0009364 0.004682 -0.3695 -1.533 202 0.04682
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(hallmark[fgseaHallmarkNhi$pathway], 
              ranksNhi, fgseaHallmarkNhi, gseaParam = 0.5)

KEGG

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaKEGGNhi <- fgsea(kegg, ranksNhi, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaKEGGNhiTop <- fgseaKEGGNhi %>%
  dplyr::filter(padj < 0.05)
fgseaKEGGNhiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 19 most significantly enriched KEGG pathways. This corresponds to an FDR of 0.160%
pathway pval FDR ES NES size padj
HEMATOPOIETIC_CELL_LINEAGE 1.943e-05 0.0002968 0.6236 2.417 127 0.003614
AUTOIMMUNE_THYROID_DISEASE 1.962e-05 0.0002968 0.7727 2.166 23 0.00365
GRAFT_VERSUS_HOST_DISEASE 1.962e-05 0.0002968 0.774 2.17 23 0.00365
INTESTINAL_IMMUNE_NETWORK_FOR_IGA_PRODUCTION 1.963e-05 0.0002968 0.7092 2.141 32 0.003651
SYSTEMIC_LUPUS_ERYTHEMATOSUS 1.965e-05 0.0002968 0.6144 2.05 52 0.003655
ASTHMA 1.968e-05 0.0002968 0.7601 2.15 24 0.003661
RIBOSOME 2.047e-05 0.0002968 -0.7016 -2.558 81 0.003808
GLYCOLYSIS_GLUCONEOGENESIS 2.051e-05 0.0002968 -0.5785 -2.16 94 0.003814
FATTY_ACID_METABOLISM 2.051e-05 0.0002968 -0.6238 -2.233 73 0.003814
PARKINSONS_DISEASE 2.054e-05 0.0002968 -0.5595 -2.178 124 0.003821
OXIDATIVE_PHOSPHORYLATION 2.061e-05 0.0002968 -0.6048 -2.364 128 0.003833
HUNTINGTONS_DISEASE 2.072e-05 0.0002968 -0.5037 -2.06 180 0.003854
ALZHEIMERS_DISEASE 2.075e-05 0.0002968 -0.537 -2.17 164 0.003859
ALLOGRAFT_REJECTION 3.928e-05 0.0005219 0.799 2.166 20 0.007307
LEISHMANIA_INFECTION 7.808e-05 0.000913 0.5497 1.925 67 0.01452
TYPE_I_DIABETES_MELLITUS 7.854e-05 0.000913 0.6792 2.021 30 0.01461
RETINOL_METABOLISM 8.371e-05 0.0009159 -0.3749 -1.607 273 0.01557
ABC_TRANSPORTERS 0.0001019 0.001053 -0.572 -1.908 50 0.01896
TYROSINE_METABOLISM 0.0001637 0.001603 -0.5288 -1.86 66 0.03045
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(kegg[fgseaKEGGNhi$pathway], 
              ranksNhi, fgseaKEGGNhi, gseaParam = 0.5)

Wikipathways

# Set seed to enable reproducibility
set.seed(22)
# Run GSEA for hallmark
fgseaWikiNhi <- fgsea(wiki, ranksNhi, nperm=1e5) %>%
  as_tibble() %>%
  dplyr::rename(FDR = padj) %>%
  mutate(padj = p.adjust(pval, "bonferroni")) %>%
  dplyr::arrange(pval)
# Create an object of pathways with adjusted p-value < 0.05 for construction of network diagrams. This should be done differently next time, but too much work has been done to change it now.
fgseaWikiNhiTop <- fgseaWikiNhi %>%
  dplyr::filter(padj < 0.05)
fgseaWikiNhiTop %>%
  dplyr::select(-leadingEdge, -nMoreExtreme) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched Wiki pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 11 most significantly enriched Wiki pathways. This corresponds to an FDR of 0.290%
pathway pval FDR ES NES size padj
Ebola Virus Pathway on Host 1.94e-05 0.001358 0.4834 1.905 144 0.01016
Fatty Acid Omega Oxidation 2.024e-05 0.001358 -0.7852 -2.389 32 0.01061
Metabolic reprogramming in colon cancer 2.03e-05 0.001358 -0.6231 -2.1 53 0.01064
Oxidative phosphorylation 2.033e-05 0.001358 -0.6005 -2.073 60 0.01065
Cytoplasmic Ribosomal Proteins 2.042e-05 0.001358 -0.7016 -2.556 81 0.0107
Amino Acid metabolism 2.052e-05 0.001358 -0.5508 -2.081 102 0.01075
Electron Transport Chain (OXPHOS system in mitochondria) 2.053e-05 0.001358 -0.6399 -2.398 97 0.01076
Nonalcoholic fatty liver disease 2.073e-05 0.001358 -0.4821 -1.933 155 0.01086
Allograft Rejection 3.937e-05 0.002125 0.5915 2.016 58 0.02063
Ethanol effects on histone modifications 4.055e-05 0.002125 -0.6248 -1.983 39 0.02125
Glycolysis and Gluconeogenesis 6.097e-05 0.002904 -0.5741 -1.949 55 0.03195
# Make a table plot of significant Hallmark pathways
if (interactive()) grid::grid.newpage()
plotGseaTable(wiki[fgseaWikiNhi$pathway], 
              ranksNhi, fgseaWikiNhi, gseaParam = 0.5)

Network Analysis

Hallmark

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigHallmarkNhi <-
  fgseaHallmarkNhiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysHallmarkNhi <- names(sigHallmarkNhi) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesHallmarkNhi <- unique(unlist(sigHallmarkNhi)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesHallmarkNhi <- full_join(pathwaysHallmarkNhi, genesHallmarkNhi, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesHallmarkNhi <- ldply(sigHallmarkNhi, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesHallmarkNhi, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesHallmarkNhi, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyHallmarkNhi <- 
  tbl_graph(nodes = nodesHallmarkNhi, 
            edges = edgesHallmarkNhi, 
            directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaHallmarkNhiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDENhi$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTableNhi$Geneid ~ 
        as.integer(row_number(label %in% topTableNhi$Geneid)), 
      id <= nrow(fgseaHallmarkNhiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaHallmarkNhiTop) ~ rainbow(nrow(fgseaHallmarkNhiTop))[id],
      label %in% topTableDENhi$Geneid ~ "black"
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaHallmarkNhiTop) ~ 
        rainbow(nrow(fgseaHallmarkNhiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyHallmarkNhi, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaHallmarkNhiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour),
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

KEGG

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigKEGGNhi <-
  fgseaKEGGNhiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysKEGGNhi <- names(sigKEGGNhi) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesKEGGNhi <- unique(unlist(sigKEGGNhi)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesKEGGNhi <- full_join(pathwaysKEGGNhi, genesKEGGNhi, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesKEGGNhi <- ldply(sigKEGGNhi, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesKEGGNhi, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesKEGGNhi, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyKEGGNhi <- 
  tbl_graph(nodes = nodesKEGGNhi, edges = edgesKEGGNhi, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaKEGGNhiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDENhi$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTableNhi$Geneid ~ 
        as.integer(row_number(label %in% topTableNhi$Geneid)), 
      id <= nrow(fgseaKEGGNhiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaKEGGNhiTop) ~ rainbow(nrow(fgseaKEGGNhiTop))[id],
      label %in% topTableDENhi$Geneid ~ "black"
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaKEGGNhiTop) ~ rainbow(nrow(fgseaKEGGNhiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyKEGGNhi, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaKEGGNhiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour), 
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black"
  ) +
  theme_graph() +
  theme(legend.position = "none")

Wikipathways

# Load significant pathways with ONLY leading edge genes determined from GSEA analysis
sigWikiNhi <-
  fgseaWikiNhiTop %>%
  split(f = .$pathway) %>% 
  lapply(extract2, "leadingEdge") %>% 
  lapply(unlist)
# Create a node list
pathwaysWikiNhi <- names(sigWikiNhi) %>%
  as.data.frame() %>%
  set_colnames("label") %>%
  mutate(label = as.character(label))
genesWikiNhi <- unique(unlist(sigWikiNhi)) %>% 
  as.data.frame() %>% 
  set_colnames("label") %>%
  mutate(label = as.character(label))
nodesWikiNhi <- full_join(pathwaysWikiNhi, genesWikiNhi, by = "label") %>%
  rowid_to_column("id")
# Then create an edge list
edgesWikiNhi <- ldply(sigWikiNhi, data.frame) %>% 
  set_colnames(c("pathway", "gene")) %>%
  mutate(gene = as.character(gene)) %>%
  left_join(nodesWikiNhi, by = c("pathway" = "label")) %>%
  dplyr::rename(from = id) %>%
  left_join(nodesWikiNhi, by = c("gene" = "label")) %>%
  dplyr::rename(to = id) %>%
  dplyr::select(from, to)
# Create tidygraph object
tidyWikiNhi <- 
  tbl_graph(nodes = nodesWikiNhi, edges = edgesWikiNhi, directed = FALSE) %>%
  activate(nodes) %>%
  left_join(idConvertSymbol, by = "label") %>%
  mutate(
    pathways = case_when(
      id <= nrow(fgseaWikiNhiTop) ~ label
    ),
    DE = case_when(
      label %in% topTableDENhi$Geneid ~ symbol
    ),
    size = case_when(
      label %in% topTableNhi$Geneid ~ 
        as.integer(row_number(label %in% topTableNhi$Geneid)), 
      id <= nrow(fgseaWikiNhiTop) ~ as.integer(4000)
    ),
    colour = case_when(
      id <= nrow(fgseaWikiNhiTop) ~ rainbow(nrow(fgseaWikiNhiTop))[id],
      label %in% topTableDENhi$Geneid ~ "black"
    )
  ) %>%
  activate(edges) %>%
  mutate(
    colour = case_when(
      from <= nrow(fgseaWikiNhiTop) ~ rainbow(nrow(fgseaWikiNhiTop))[from]
    )
  )
# Set seed to enable reproducibility (seed selected to create graph with non-overlapping labels)
set.seed(22)
# Plot graph
ggraph(tidyWikiNhi, layout = "fr") +
  scale_fill_manual(
    values = c(rainbow(nrow(fgseaWikiNhiTop)), "black"), 
    na.value = "gray80"
  ) +
  geom_edge_arc(
    aes(color = colour),
    alpha = 0.5, 
    show.legend = FALSE, 
    curvature = 0.5
  ) +
  geom_node_point(
    aes(size = size, fill = colour), 
    shape = 21, 
    stroke = 0.5, 
    show.legend = FALSE
  ) +
  geom_node_label(
    aes(label = pathways), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.7, 
    label.padding = 0.1
  ) +
  geom_node_text(
    aes(label = DE), 
    repel = TRUE, 
    size = 3, 
    alpha = 0.8, 
    colour = "black") +
  theme_graph() +
  theme(legend.position = "none")

RUVSeq

k = 1

# Extract subset of low expressed genes from DE analysis to act as negative controls for RUVg procedure
negControlNhi <- topTableNhi %>% 
  dplyr::arrange(desc(P.Value)) %>%
  .[1:10000,] %>%
  .$Geneid
# Run RUVSeq
RUVk1Nhi <- RUVg(dgeFiltNhi$counts, negControlNhi, 1)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeRUVk1Nhi <- dgeFiltNhi
# Replace with results
dgeRUVk1Nhi$counts <- RUVk1Nhi$normalizedCounts
# Run PCA function
pcaRUVk1Nhi <- dgeRUVk1Nhi %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVk1Nhi)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 43.52 18.44 17.09 15.81 14.55 13.32 7.204e-14 5.701e-14
Proportion of Variance 0.5984 0.1074 0.09228 0.07896 0.06686 0.05606 0 0
Cumulative Proportion 0.5984 0.7058 0.7981 0.8771 0.9439 1 1 1
# Plot PCA
pcak1Nhi <- pcaRUVk1Nhi$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeRUVk1Nhi$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

k = 2

# Run RUVSeq
RUVk2Nhi <- RUVg(dgeFiltNhi$counts, negControlNhi, 2)
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeRUVk2Nhi <- dgeFiltNhi
# Replace with results
dgeRUVk2Nhi$counts <- RUVk2Nhi$normalizedCounts
# Run PCA function
pcaRUVk2Nhi <- dgeRUVk2Nhi %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVk2Nhi)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
Standard deviation 43.72 17.15 15.82 14.75 13.38 0.5608 5.785e-14 1.297e-14
Proportion of Variance 0.6701 0.1032 0.08769 0.07627 0.06273 0.00011 0 0
Cumulative Proportion 0.6701 0.7732 0.8609 0.9372 0.9999 1 1 1
# Plot PCA
pcak2Nhi <- pcaRUVk2Nhi$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeRUVk2Nhi$samples, "sample")) %>%
  ggplot(aes(PC1, PC2, colour = group, label = sample)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw()

Results

# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.5, width = 0.5, height = 1)
vp2 <- viewport(x = 0.75, y = 0.5, width = 0.5, height = 1)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(pcak1Nhi, vp = vp1)
print(pcak2Nhi, vp = vp2)

Integration of 2019 data with 2017 data

Meta Analysis

Hallmark

# Extract p-values from 2017 and 2019 GSEA results and combine using Fisher's method
p2017Hallmark <- fgseaHallmarkNhi %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2017 = pval)
p2019Hallmark <- fgseaHallmark %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2019 = pval)
fgseaHallmarkMeta <- full_join(p2017Hallmark, p2019Hallmark) %>%
  replace(is.na(.), 0) %>%
  dplyr::mutate(
    chiSquare = -2 * (log10p2017 + log10p2019),
    pCombined = pchisq(chiSquare, df = 4, lower.tail = FALSE),
    FDR = p.adjust(pCombined, "fdr")
  ) %>%
  dplyr::arrange(pCombined)

KEGG

# Extract p-values from 2017 and 2019 GSEA results and combine using Fisher's method
p2017KEGG <- fgseaKEGGNhi %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2017 = pval)
p2019KEGG <- fgseaKEGG %>%
  dplyr::select(pathway, pval) %>%
  dplyr::mutate(pval = log10(pval)) %>%
  dplyr::rename(log10p2019 = pval)
fgseaKEGGMeta <- full_join(p2017KEGG, p2019KEGG) %>%
  replace(is.na(.), 0) %>%
  dplyr::mutate(
    chiSquare = -2 * (log10p2017 + log10p2019),
    pCombined = pchisq(chiSquare, df = 4, lower.tail = FALSE),
    FDR = p.adjust(pCombined, "fdr")
  ) %>%
  dplyr::arrange(pCombined)

Results

# Hallmark
fgseaHallmarkMeta %>%
  dplyr::filter(FDR < 0.05) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched hallmark pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 7 most significantly enriched hallmark pathways. This corresponds to an FDR of 4.24%
pathway log10p2017 log10p2019 chiSquare pCombined FDR
INTERFERON_GAMMA_RESPONSE -4.716 -4.903 19.24 0.0007057 0.009079
ALLOGRAFT_REJECTION -4.716 -4.901 19.24 0.0007066 0.009079
MYC_TARGETS_V1 -4.68 -4.908 19.18 0.000726 0.009079
OXIDATIVE_PHOSPHORYLATION -4.68 -4.908 19.17 0.0007263 0.009079
MTORC1_SIGNALING -4.68 -3.706 16.77 0.00214 0.01835
INTERFERON_ALPHA_RESPONSE -3.505 -4.849 16.71 0.002202 0.01835
ADIPOGENESIS -3.503 -3.732 14.47 0.005934 0.04239
# KEGG
fgseaKEGGMeta %>%
  dplyr::filter(FDR < 0.05) %>%
  pander(
    style = "rmarkdown", 
    split.tables = Inf, 
    justify = "lrrrrr", 
    caption = paste(
      "The", nrow(.), "most significantly enriched KEGG pathways.",
      "This corresponds to an FDR of", percent(max(.$FDR)))
  )
The 6 most significantly enriched KEGG pathways. This corresponds to an FDR of 4.48%
pathway log10p2017 log10p2019 chiSquare pCombined FDR
HUNTINGTONS_DISEASE -4.684 -4.894 19.16 0.0007325 0.02834
ALZHEIMERS_DISEASE -4.683 -4.888 19.14 0.0007368 0.02834
OXIDATIVE_PHOSPHORYLATION -4.686 -4.874 19.12 0.0007444 0.02834
PARKINSONS_DISEASE -4.687 -4.871 19.12 0.0007452 0.02834
RIBOSOME -4.689 -4.846 19.07 0.0007617 0.02834
SYSTEMIC_LUPUS_ERYTHEMATOSUS -4.707 -4.117 17.65 0.001447 0.04485

PCA

Set up DGE List and QC data

# Combine counts into the same DGE List
# First extract counts from 2017 DGE List
countsNhi <- dgeListNhi$counts %>%
  as.data.frame() %>%
  rownames_to_column("Geneid") %>%
  set_colnames(str_remove(colnames(.), "_6_0")) %>%
  set_colnames(str_replace(colnames(.), "q96", "Q")) %>%
  set_colnames(str_replace(colnames(.), "wt", "W")) %>%
  as_tibble()
# Then join with 2019 counts
dgeListComb <- full_join(counts, countsNhi) %>%
  replace(is.na(.), 0) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
# Set group variable
dgeListComb$samples$group <- colnames(dgeListComb) %>%
  str_extract("(W|Q)") %>%
  factor(levels = c("W", "Q"))
# Create logical vector of genes to keep that fit criteria
genes2keepComb <- dgeListComb %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(8)
# Create new DGEList of genes fitting criteria
dgeFiltComb <- dgeListComb[genes2keepComb,, 
                                   keep.lib.sizes = FALSE] %>%
  calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeListComb %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "Before Filtering")
dgeFiltComb %>%
  cpm(log = TRUE) %>%
  plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))

Plot PCA graphs

# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pcaComb <- dgeFiltComb %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander(split.tables = Inf)
  PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 22.27 18.07 16.75 14.73 14.45 13.34 11.87 11.2 5.671e-14
Proportion of Variance 0.2513 0.1655 0.1421 0.1099 0.1058 0.09023 0.07145 0.06362 0
Cumulative Proportion 0.2513 0.4168 0.559 0.6689 0.7747 0.8649 0.9364 1 1
# Create PCA plots
pca12 <- pcaComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC2) %>%
  left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC1, PC2, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = c(1.2, 0.5))
pca13 <- pcaComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC1, PC3) %>%
  left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC1, PC3, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = "none")
pca23 <- pcaComb$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  dplyr::select(sample, PC2, PC3) %>%
  left_join(rownames_to_column(dgeFiltComb$samples, "sample")) %>%
  mutate(
    data = case_when(
      str_detect(.$sample, "_") == TRUE ~ "2017",
      str_detect(.$sample, "_") == FALSE ~ "2019"
    )
  ) %>%
  ggplot(aes(PC2, PC3, colour = data, label = sample, shape = group)) +
  geom_point() +
  geom_text_repel(show.legend = FALSE) +
  theme_bw() +
  theme(legend.position = "none")
# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.25, width = 0.5, height = 0.5)
vp2 <- viewport(x = 0.25, y = 0.75, width = 0.5, height = 0.5)
vp3 <- viewport(x = 0.75, y = 0.75, width = 0.5, height = 0.5)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(pca12, vp = vp1)
print(pca13, vp = vp2)
print(pca23, vp = vp3)